Updates to Machine Learning for Pediatric Obesity

08 March 2018

Accomplished since last meeting

  • Another 1,900 addresses were successfully geocoded
  • Relative coordinates were converted to (x, y) pairs and a model-ready data set was created
  • Progressive time models completed
  • High-risk kids (9 boys and 7 girls) from the sample models were extracted to get their growth curves
  • Sample figures for the paper were created
  • A function to create table outputs for summary statistics of key features was created
  • Accuracy of the best models was compared against Mary Jo's original and updated inclusion criteria

BMI Percentile Distribution for All Children in the Cohort

In [4]:
# Grouped bar chart of the BMI-percentile bin counts, boys next to girls.
# Uses names defined in earlier cells: bins, bin_names, boys_bins, girls_bins,
# boys and girls.
fig, ax = plt.subplots(figsize=(12,8))
index = np.arange(len(bins))
bar_width = 0.4
opacity = 0.8

# The legend labels are raw strings so the mathtext "\mu" reaches matplotlib
# unchanged; in a normal string literal "\m" is an invalid escape sequence
# that newer Python versions warn about. The resulting text is identical.
# n is the sum of the bin counts and mu is the mean of the raw values.
bar1 = plt.bar(index, boys_bins, bar_width, alpha=opacity,
               label=r'Boys (n={0:,.0f}; $\mu$={1:,.2f})'.format(np.sum(boys_bins), np.mean(boys)))

# Girls' bars are shifted right by one bar width so each bin shows a pair.
bar2 = plt.bar(index + bar_width, girls_bins, bar_width, alpha=opacity,
               label=r'Girls (n={0:,.0f}; $\mu$={1:,.2f})'.format(np.sum(girls_bins), np.mean(girls)))

# plt.xlabel('Obesity Percentiles', fontsize=14)
plt.ylabel('Number of Children', fontsize=14)
plt.title('BMI Percentile Distribution at 4.5 to 5.5 years - No Exclusions', fontsize=20)
# plt.xticks(index + bar_width, ['{0:.1f}%'.format(b * 100) for b in bins])
# NOTE(review): ticks are placed at `index`, i.e. under the boys' bar of each
# pair; `index + bar_width / 2` would centre them under the pair - confirm
# which placement is wanted.
plt.xticks(index, bin_names, rotation=30)
# Dashed horizontal grid lines, drawn behind the bars.
ax.yaxis.grid(linestyle='--')
ax.set_axisbelow(True)
plt.legend(fontsize=12)

plt.tight_layout()
# plt.savefig('../outputs_age_analyses20180221/no_exclusions_bmi_percentiles.png', dpi=96)
plt.show()

Growth Curves for the 16 Highest Risk Kids in the Data Set

In [9]:
# Growth curve for the child at keys[0]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[0], hide_mrn=True)
In [10]:
# Growth curve for the child at keys[1]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[1], hide_mrn=True)
In [11]:
# Growth curve for the child at keys[2]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[2], hide_mrn=True)
In [12]:
# Growth curve for the child at keys[3]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[3], hide_mrn=True)
In [13]:
# Growth curve for the child at keys[4]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[4], hide_mrn=True)
In [14]:
# Growth curve for the child at keys[5]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[5], hide_mrn=True)
In [15]:
# Growth curve for the child at keys[6]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[6], hide_mrn=True)
In [16]:
# Growth curve for the child at keys[7]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[7], hide_mrn=True)
In [17]:
# Growth curve for the child at keys[8]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[8], hide_mrn=True)
In [18]:
# Growth curve for the child at keys[9]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[9], hide_mrn=True)
In [19]:
# Growth curve for the child at keys[10]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[10], hide_mrn=True)
In [20]:
# Growth curve for the child at keys[11]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[11], hide_mrn=True)
In [21]:
# Growth curve for the child at keys[12]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[12], hide_mrn=True)
In [22]:
# Growth curve for the child at keys[13]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[13], hide_mrn=True)
In [23]:
# Growth curve for the child at keys[14]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[14], hide_mrn=True)
In [24]:
# Growth curve for the child at keys[15]; hide_mrn=True presumably keeps the MRN off the figure.
train.plot_growth_curve(d1, None, keys[15], hide_mrn=True)

Results for the most predictive feature: 'Vital: BMI-latest'

In [26]:
# ROC curves for the runs whose title contains `modelix`, leaving out the
# random-forest and gradient-boosting runs. Girls are drawn solid, all other
# runs dashed. `modelix` stays a cell-level name because the next cell reads it.
modelix = 'BMI'
plt.figure(figsize=(9,9))
for ix in range(len(prec_total)):
    curve_title = titles_total[ix]
    if modelix not in curve_title:
        continue
    if 'randomforest' in curve_title or 'gradientboost' in curve_title:
        continue
    line_style = '-' if 'girls' in curve_title else '--'
    # x axis is 1 - specificity, y axis is recall (sensitivity).
    plt.plot(1 - np.array(spec_total[ix]), np.array(recall_total[ix]),
             linestyle=line_style,
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[ix][0]))

plt.legend(fontsize=10)
plt.xlabel('1 - Specificity', fontsize=12)
plt.ylabel('Sensitivity', fontsize=12)
plt.axis('equal')
plt.title('ROC Curve: Obesity Predicted at 5 years - "Vital: BMI-latest"', fontsize=14)
plt.grid(True)
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_ROC.png', dpi=300)
plt.show()
In [27]:
# Precision-recall curves for the runs whose title contains `modelix` (set in
# an earlier cell), leaving out random-forest and gradient-boosting runs.
# Girls are drawn solid, all other runs dashed.
# NOTE(review): the legend reports auc_list[ix][0], the same value used in the
# ROC legends - confirm that is the area intended for a PR curve.
plt.figure(figsize=(9,9))
for ix, precision in enumerate(prec_total):
    curve_title = titles_total[ix]
    if modelix not in curve_title:
        continue
    if 'randomforest' in curve_title or 'gradientboost' in curve_title:
        continue
    line_style = '-' if 'girls' in curve_title else '--'
    plt.plot(recall_total[ix], precision, linestyle=line_style,
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[ix][0]))
plt.xlabel('Recall (Sensitivity)', fontsize=14)
plt.ylabel('Precision (PPV)', fontsize=14)
plt.legend(fontsize=8)
plt.axis('equal')
plt.title('Precision-Recall Curve: Obesity Predicted at 5 years - "Vital: BMI-latest"', fontsize=14)
plt.grid()
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_PR.png', dpi=300)
plt.show()

LASSO and Random Forest Comparison for Boys and Girls

In [28]:
# ROC curves for every run whose title contains none of the excluded tags.
# Girls are drawn solid, all other runs dashed.
plt.figure(figsize=(9,9))
excluded_tags = ('w/o vitals','Wt','no_maternal','w/o exclusions', 'BMI','gradientboost')
for ix in range(len(prec_total)):
    curve_title = titles_total[ix]
    if any(tag in curve_title for tag in excluded_tags):
        continue
    line_style = '-' if 'girls' in curve_title else '--'
    # x axis is 1 - specificity, y axis is recall (sensitivity).
    plt.plot(1 - np.array(spec_total[ix]), np.array(recall_total[ix]),
             linestyle=line_style,
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[ix][0]))

plt.legend(fontsize=9)
plt.xlabel('1 - Specificity', fontsize=14)
plt.ylabel('Sensitivity', fontsize=14)
plt.axis('equal')
plt.title('ROC Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid(True)
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_ROC.png', dpi=300)
plt.show()
In [29]:
# Precision-recall curves for every run whose title contains none of the
# excluded tags. Girls are drawn solid, all other runs dashed.
plt.figure(figsize=(9,9))
excluded_tags = ('w/o vitals','Wt','no_maternal','w/o exclusions', 'BMI','gradientboost')
for ix, precision in enumerate(prec_total):
    curve_title = titles_total[ix]
    if any(tag in curve_title for tag in excluded_tags):
        continue
    line_style = '-' if 'girls' in curve_title else '--'
    plt.plot(recall_total[ix], precision, linestyle=line_style,
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[ix][0]))
plt.xlabel('Recall (Sensitivity)', fontsize=14)
plt.ylabel('Precision (PPV)', fontsize=14)
plt.legend(fontsize=8)
plt.axis('equal')
plt.title('Precision-Recall Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid()
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_PR.png', dpi=300)
plt.show()
In [30]:
# Side-by-side precision-recall panels: girls on the left, boys on the right.
# Both panels skip any run whose title contains one of the excluded tags.
plt.figure(figsize=(18,9))
excluded_tags = ('w/o vitals','Wt','no_maternal','w/o exclusions', 'BMI','gradientboost')

# Left panel: runs whose title contains 'girls'.
plt.subplot(1, 2, 1)
for ix, precision in enumerate(prec_total):
    curve_title = titles_total[ix]
    if any(tag in curve_title for tag in excluded_tags):
        continue
    if 'girls' not in curve_title:
        continue
    plt.plot(recall_total[ix], precision, linestyle='-',
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[ix][0]))
plt.ylabel('Precision (PPV)', fontsize=14)
plt.legend(fontsize=8, loc=8)
plt.axis('equal')
plt.title('Girls Precision-Recall Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid()
plt.tight_layout()

# Right panel: runs whose title contains 'boys'.
plt.subplot(1, 2, 2)
for ix, precision in enumerate(prec_total):
    curve_title = titles_total[ix]
    if any(tag in curve_title for tag in excluded_tags):
        continue
    if 'boys' not in curve_title:
        continue
    plt.plot(recall_total[ix], precision, linestyle='-',
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[ix][0]))
plt.legend(fontsize=8, loc=8)
plt.axis('equal')
plt.xlabel('Recall (Sensitivity)', fontsize=14)
plt.title('Boys Precision-Recall Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid()
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_PR.png', dpi=300)
plt.show()

Results for the best model at each prediction age

In [32]:
# ROC curves for the runs listed in top_ix. The third field of each top_ix
# entry is used as the index into the *_total lists. Girls are drawn solid,
# all other runs dashed.
plt.figure(figsize=(9,9))
for l in top_ix:
    best_ix = l[2]
    curve_title = titles_total[best_ix]
    line_style = '-' if 'girls' in curve_title else '--'
    # x axis is 1 - specificity, y axis is recall (sensitivity).
    plt.plot(1 - np.array(spec_total[best_ix]), np.array(recall_total[best_ix]),
             linestyle=line_style,
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[best_ix][0]))

plt.legend(fontsize=10)
plt.xlabel('1 - Specificity', fontsize=14)
plt.ylabel('Sensitivity', fontsize=14)
plt.axis('equal')
plt.title('ROC Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid(True)
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_ROC.png', dpi=300)
plt.show()
In [33]:
# Precision-recall curves for the runs listed in top_ix. The third field of
# each top_ix entry is used as the index into the *_total lists. Girls are
# drawn solid, all other runs dashed.
plt.figure(figsize=(9,9))
for l in top_ix:
    best_ix = l[2]
    curve_title = titles_total[best_ix]
    line_style = '-' if 'girls' in curve_title else '--'
    plt.plot(recall_total[best_ix], prec_total[best_ix], linestyle=line_style,
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[best_ix][0]))
plt.xlabel('Recall (Sensitivity)', fontsize=14)
plt.ylabel('Precision (PPV)', fontsize=14)
plt.legend(fontsize=10)
plt.axis('equal')
plt.title('Precision-Recall Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid()
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_PR.png', dpi=300)
plt.show()
In [34]:
# Side-by-side precision-recall panels for the runs listed in top_ix: girls on
# the left, boys on the right. The third field of each top_ix entry is used as
# the index into the *_total lists.
plt.figure(figsize=(20,10))

# Left panel: runs whose title contains 'girls'.
plt.subplot(1, 2, 1)
for l in top_ix:
    best_ix = l[2]
    curve_title = titles_total[best_ix]
    if 'girls' not in curve_title:
        continue
    plt.plot(recall_total[best_ix], prec_total[best_ix], linestyle='-',
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[best_ix][0]))
plt.ylabel('Precision (PPV)', fontsize=14)
plt.legend(fontsize=10, loc=8)
plt.axis('equal')
plt.title('Girls Precision-Recall Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid()
plt.tight_layout()

# Right panel: runs whose title contains 'boys'.
plt.subplot(1, 2, 2)
for l in top_ix:
    best_ix = l[2]
    curve_title = titles_total[best_ix]
    if 'boys' not in curve_title:
        continue
    plt.plot(recall_total[best_ix], prec_total[best_ix], linestyle='-',
             label=curve_title + ' - AUC={:0.2f}'.format(auc_list[best_ix][0]))
plt.legend(fontsize=10, loc=8)
plt.axis('equal')
plt.xlabel('Recall (Sensitivity)', fontsize=14)
plt.title('Boys Precision-Recall Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid()
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_PR.png', dpi=300)
plt.show()

Comparison of Mary Jo's Inclusion Criteria to Pediatric Study

  • Using the best model for boys and girls where prediction is at 24 months
  • Original: base parameters for the pediatric study where no inclusion criteria are used
  • Inclusion: uses Mary Jo's inclusion criteria
    • Age of mother at birth is at least 18 years old
    • No complications with the child delivery
    • Mother is Hispanic/Latina
  • Modified Inclusion: the same as above but all ethnicities are allowed
In [49]:
# ROC curves for the 24-month runs, one per entry in `titles`.
# Line style is chosen from the lower-cased title: solid when it contains
# 'original', dotted when it contains 'inclusion mod', dashed otherwise.
plt.figure(figsize=(9,9))
for i, title in enumerate(titles):
    lowered_title = titles[i].lower()
    if 'original' in lowered_title:
        cohort_style = '-'
    elif 'inclusion mod' in lowered_title:
        cohort_style = ':'
    else:
        cohort_style = '--'
    # x axis is 1 - specificity, y axis is recall (sensitivity).
    plt.plot(1 - np.array(spec_24[i]), np.array(recall_24[i]),
             linestyle=cohort_style,
             label=title + ' - AUC={:0.2f}'.format(auc_24[i]))

plt.legend(fontsize=10)
plt.xlabel('1 - Specificity', fontsize=14)
plt.ylabel('Sensitivity', fontsize=14)
plt.axis('equal')
plt.title('ROC Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid(True)
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_ROC.png', dpi=300)
plt.show()
In [46]:
# Precision-recall curves for the 24-month runs, one per entry in `titles`.
# Line style is chosen from the lower-cased title: solid when it contains
# 'original', dotted when it contains 'inclusion mod', dashed otherwise.
plt.figure(figsize=(9,9))
for i, title in enumerate(titles):
    lowered_title = titles[i].lower()
    if 'original' in lowered_title:
        cohort_style = '-'
    elif 'inclusion mod' in lowered_title:
        cohort_style = ':'
    else:
        cohort_style = '--'
    plt.plot(recall_24[i], prec_24[i], linestyle=cohort_style,
             label=titles[i] + ' - AUC={:0.2f}'.format(auc_24[i]))
plt.xlabel('Recall (Sensitivity)', fontsize=14)
plt.ylabel('Precision (PPV)', fontsize=14)
plt.legend(fontsize=10)
plt.axis('equal')
plt.title('Precision-Recall Curve: Obesity Predicted at 5 years', fontsize=18)
plt.grid()
plt.tight_layout()
# plt.savefig(newdir+'/Pediatric_Girls_PR.png', dpi=300)
plt.show()

Next Steps

  • Rerun analyses at each of the prediction time points with:
    • all data
    • all data excluding weight and BMI readings
    • all data excluding census data
    • predict for extreme obesity (99%?)
  • Update address data from the most recent data release
    • Will need to be geocoded
    • Census features for at birth, 2 years old (or month we are predicting from), 1 year before birth?
  • Report back on results with cleaned up plots